####################
## Name: Carlos Gould
## Project: Census Data Analysis for Ecuador 
####################

# 0. Libraries ------

library(broom)
library(tidyverse)
library(ggthemes)
library(mgcViz)
library(MASS)
library(gamm4)
library(mgcv)
library(splines)
library(broom)
library(ggdist)
library(specr)

# 1. Run beginning of regression5 code ---------

# 2. Specification plots --------

# 2.1 glm.qp Formulaa -------

glm.qp.empty <- function(formula, data) {
  require(broom)
  
  formula <- formula
  
  glm(
    formula = formula,
    data = data,
    offset = log(under5population),
    family = quasipoisson()
  )
}

glm.qp <- function(formula, data, ...) {
  require(broom)
  
  formula <- paste(formula, "+ canton_id_universal + period")
  
  glm(
    formula = formula,
    data = data,
    offset = log(under5population),
    family = quasipoisson()
  )
}



# 2.2 Set up variable naming conventions -------

full_df_2 <- full_df %>% 
  mutate(ru = percent_rural_corrected_ratio,
         el = electricidad_no,
         rf = material_techo_nicer_all,
         wl = material_pared_hormigon_bloque_ladrillo,
         fl = material_piso_nicer,
         mts = materials_index,
         wa = obtiene_agua_redpublica_tuberia_adentro_use,
         sw = elimina_servidas_red_pozo_letrina,
         sh = servicio_ducha_excluso,
         hyi = hygiene_index,
         lw = Literate_women,
         sg = InSchool_girls,
         ii = Idioma_indigena_self,
         mv = SAR_use,
         pv = pcv3_use,
         vi = vaccine_index,
         an = ANC_use,
         anv = ANC_visits_median_use)





# Full variables ---------


setup_indices_all <- specr::setup_specs(y = c("ALRI5_5_plus"), 
                                    x = c("cf10"), 
                                    model = c("glm.qp.empty"),
                                    all.comb = TRUE,
                                    controls = c("ru",
                                                 "el",
                                                 
                                                 "rf",
                                                 "wl",
                                                 "fl",
                                                 "mts",
                                                 
                                                 "wa",
                                                 "sh",
                                                 "sw",
                                                 "hyi",

                                                 "lw",
                                                 "sg",
                                                 "ii",
                                                 
                                                 # "mv",
                                                 "pv",
                                                 "vi",
                                                 
                                                 "an",
                                                 "anv"
                                    )) %>% 
  mutate(number = row_number()) %>% 
  mutate(full_model = paste0("ALRI5_5_plus ~ offset(log(under5population)) + canton_id_universal + period + cf10 + ", controls))

# Remove cases where we have a component of an index and the index 

materials_components <- c("rf", "wl", "fl")
materials_index <- c("mts")
hygiene_components <- c("wa", "sh", "sw")
hygiene_index <- c("hyi")


tmp <- setup_indices_all %>% 
  mutate(has_materials_component = ifelse(str_detect(controls, materials_components)==TRUE, 1, 0),
         has_materials_index = ifelse(str_detect(controls, materials_index)==TRUE, 1, 0),
         has_hygiene_component = ifelse(str_detect(controls, hygiene_components)==TRUE, 1, 0),
         has_hygiene_index = ifelse(str_detect(controls, hygiene_index)==TRUE, 1, 0)) %>% 
  mutate(materials_conflict = ifelse(has_materials_component==1 &
                                       has_materials_index==1, 1, 0),
         hygiene_conflict = ifelse(has_hygiene_component==1 &
                                     has_hygiene_index==1, 1, 0))

tmp1 <- tmp %>% 
  filter(hygiene_conflict==1)

setup_indices_all_clean <- setup_indices_all %>% 
  mutate(has_materials_component = ifelse(str_detect(controls, materials_components)==TRUE, 1, 0),
         has_materials_index = ifelse(str_detect(controls, materials_index)==TRUE, 1, 0),
         has_hygiene_component = ifelse(str_detect(controls, hygiene_components)==TRUE, 1, 0),
         has_hygiene_index = ifelse(str_detect(controls, hygiene_index)==TRUE, 1, 0)) %>% 
  mutate(materials_conflict = ifelse(has_materials_component==1 &
                                       has_materials_index==1, 1, 0),
         hygiene_conflict = ifelse(has_hygiene_component==1 &
                                     has_hygiene_index==1, 1, 0)) %>% 
  filter(materials_conflict == 0 & hygiene_conflict==0)


# Hack together specr myself -------

full_coefs_indices_df <- setup_indices_all_clean[1,] 

for (i in 1:5000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df <- full_coefs_indices_df %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}

# write_rds(full_coefs_indices_df, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_1_5000.rds")
# rm(full_coefs_indices_df)

full_coefs_indices_df_2 <- setup_indices_all_clean[1,] 

tictoc::tic()
Sys.time()
for (i in 5001:15000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df_2 <- full_coefs_indices_df_2 %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}
tictoc::toc()

# write_rds(full_coefs_indices_df_2, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_5001_15000.rds")
# rm(full_coefs_indices_df_2)

full_coefs_indices_df_3 <- setup_indices_all_clean[1,] 

for (i in 15001:25000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df_3 <- full_coefs_indices_df_3 %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}

write_rds(full_coefs_indices_df_3, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_15001_25000.rds")
rm(full_coefs_indices_df_3)



full_coefs_indices_df_4 <- setup_indices_all_clean[1,] 

for (i in 25001:35000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df_4 <- full_coefs_indices_df_4 %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}

write_rds(full_coefs_indices_df_4, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_25001_35000.rds")
rm(full_coefs_indices_df_4)



full_coefs_indices_df_5 <- setup_indices_all_clean[1,] 


for (i in 35001:45000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df_5 <- full_coefs_indices_df_5 %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}

write_rds(full_coefs_indices_df_5, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_35001_45000.rds")
rm(full_coefs_indices_df_5)


full_coefs_indices_df_6 <- setup_indices_all_clean[1,] 


for (i in 45001:55000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df_6 <- full_coefs_indices_df_6 %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}

write_rds(full_coefs_indices_df_6, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_45001_55000.rds")
rm(full_coefs_indices_df_6)




full_coefs_indices_df_7 <- setup_indices_all_clean[1,] 


for (i in 55001:65000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df_7 <- full_coefs_indices_df_7 %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}

write_rds(full_coefs_indices_df_7, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_55001_65000.rds")
rm(full_coefs_indices_df_7)



full_coefs_indices_df_8 <- setup_indices_all_clean[1,] 


for (i in 65001:75000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df_8 <- full_coefs_indices_df_8 %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}

write_rds(full_coefs_indices_df_8, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_65001_75000.rds")
rm(full_coefs_indices_df_8)




full_coefs_indices_df_9 <- setup_indices_all_clean[1,] 


for (i in 75001:85000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df_9 <- full_coefs_indices_df_9 %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}

write_rds(full_coefs_indices_df_9, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_75001_85000.rds")
rm(full_coefs_indices_df_9)





full_coefs_indices_df_10 <- setup_indices_all_clean[1,] 


for (i in 85001:95000) {
  mod_ran <- glm(as.formula(setup_indices_all_clean$full_model[i]), data=full_df_2, family=quasipoisson())
  
  coefs_df <- tidy(mod_ran) %>% 
    filter(term == "cf10") %>% 
    dplyr::select(term, estimate, std.error, p.value) %>% 
    mutate(full_model = setup_indices_all_clean$full_model[i])
  
  full_coefs_indices_df_8 <- full_coefs_indices_df_8 %>% 
    bind_rows(coefs_df)
  
  print(i)
  
}

write_rds(full_coefs_indices_df_8, "~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_65001_75000.rds")
rm(full_coefs_indices_df_8)



# Analyze results and plot -------

full_coefs_indices_df <- read_rds("~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_1_5000.rds")
full_coefs_indices_df_2 <- read_rds("~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_5001_15000.rds")
full_coefs_indices_df_3 <- read_rds("~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_15001_25000.rds")
full_coefs_indices_df_4 <- read_rds("~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_25001_35000.rds")
full_coefs_indices_df_5 <- read_rds("~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_35001_45000.rds")
full_coefs_indices_df_6 <- read_rds("~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_45001_55000.rds")
full_coefs_indices_df_7 <- read_rds("~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_55001_65000.rds")
full_coefs_indices_df_8 <- read_rds("~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_65001_75000.rds")
full_coefs_indices_df_9 <- read_rds("~/Dropbox/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/specr_july21_75001_85000.rds")

full_coefs_indices_df_combined <- full_coefs_indices_df %>% 
  bind_rows(full_coefs_indices_df_2) %>% 
  bind_rows(full_coefs_indices_df_3) %>% 
  bind_rows(full_coefs_indices_df_4) %>% 
  bind_rows(full_coefs_indices_df_5) %>% 
  bind_rows(full_coefs_indices_df_6) %>% 
  bind_rows(full_coefs_indices_df_7) %>% 
  bind_rows(full_coefs_indices_df_8) %>% 
  bind_rows(full_coefs_indices_df_9) %>% 
  mutate(irr = exp(estimate),
         irr.low = exp(estimate-1.96*std.error),
         irr.high = exp(estimate+1.96*std.error)) %>% 
  arrange(irr) %>% 
  mutate(number = row_number(),
         has_materials_index = ifelse(str_detect(controls, materials_index)==TRUE, 1, 0),
         has_hygiene_index = ifelse(str_detect(controls, hygiene_index)==TRUE, 1, 0),
         p5 = ifelse(irr.high<=1, "P < 0.05; RR < 1.00",
                     ifelse(irr<=1, "P>0.05; RR < 1.00",
                            ifelse(irr>1, "P>0.05; RR > 1.00", NA))))

table(full_coefs_indices_df_combined$irr.high<1)
table(full_coefs_indices_df_combined$irr<1)

full_coefs_indices_df_combined %>% 
  summarize(mean = mean(irr, na.rm=T),
            sd = sd(irr, na.rm=T),
            ten = quantile(irr, probs=0.10, na.rm=T),
            twentyfive = quantile(irr, probs=0.25, na.rm=T),
            median = median(irr, na.rm=T),
            seventyfive = quantile(irr, probs=0.75, na.rm=T),
            ninety = quantile(irr, probs=0.90, na.rm=T))

specr_indices_fig <- ggplot(full_coefs_indices_df_combined %>% 
                              filter(!is.na(p5)), aes(x=number, 
                                                        y=irr,
                                                        ymin=irr.low,
                                                        ymax=irr.high, color=p5)) + 
  geom_hline(yintercept=1, size=1) +
  geom_hline(yintercept=0.90, linetype=4, size=0.5) +
  geom_hline(yintercept=.79, linetype=4, size=0.5) +
  geom_linerange(alpha=0.3) +
  geom_point(color="white", size=0.2) +
  ggtitle("Full potential confounder combination specification plot",
          subtitle = "Quasi-poisson generalized linear models") +
  geom_label(aes(x=545, y=0.9125, label = "Preferred specification"), color="black", label.size = NA, size=4, family="Helvetica") +
  geom_label(aes(x=-100, y=0.805, label = "Empty model"), color="black", label.size = NA, size=3, family="Helvetica") + 
  geom_label(aes(x=60000, y=1.10, label = "In 98.7% of models, RR<1.00\nIn 70.3% of models, P<0.05\nN=73,818 Total Models"), 
             label.size = NA, color="black", fill="white") +
  scale_y_log10(breaks=c(0.75, 0.8, 0.85, 0.9, 0.95, 1, 1.05, 1.1, 1.15, 1.2)) +
  ggsci::scale_color_lancet() + 
  ggthemes::theme_clean() +
  ylab("Rate ratio (95% CI)") +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        plot.title = element_text(size=16, face="bold"),
        axis.title.y = element_text(size=14),
        legend.title = element_blank(),
        axis.text.y = element_text(size=13),
        axis.line.x = element_blank(),
        axis.line.y = element_blank(),
        axis.ticks = element_blank(),
        plot.background = element_blank())

specr_indices_fig

# cowplot::ggsave2("~/Dropbox/Ecuador National Health and LPG Analysis/Final_Repository/Figures/Results/specr_indices_fig_sept2021.png",
#                  specr_indices_fig,
#                  dpi=300,
#                  height=150,
#                  width = 400,
#                  unit = "mm")

